Data Visualization with Pandas and Matplotlib

# import library 
import pandas as pd 
import matplotlib.pyplot as plt 

# display plot in the notebook 
%matplotlib inline 

# set figuresize and fontsize 
plt.rcParams['figure.figsize'] = (8,6) 
plt.rcParams['font.size'] = 14 
# read data 
drink_cols = ["country", 'beer', 'spirit', 'wine', 'liters', 'continent']
drinks = pd.read_csv("../data/drinks.csv", header=0, names=drink_cols, na_filter=False)

Data Exploration

# examine first few rows 
country beer spirit wine liters continent
0 Afghanistan 0 0 0 0.0 AS
1 Albania 89 132 54 4.9 EU
2 Algeria 25 0 14 0.7 AF
3 Andorra 245 138 312 12.4 EU
4 Angola 217 57 45 5.9 AF
# observations and columns 
(193, 6)
# data structure 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    193 non-null    object 
 1   beer       193 non-null    int64  
 2   spirit     193 non-null    int64  
 3   wine       193 non-null    int64  
 4   liters     193 non-null    float64
 5   continent  193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 9.2+ KB
# numerical summary 
beer spirit wine liters
count 193.000000 193.000000 193.000000 193.000000
mean 106.160622 80.994819 49.450777 4.717098
std 101.143103 88.284312 79.697598 3.773298
min 0.000000 0.000000 0.000000 0.000000
25% 20.000000 4.000000 1.000000 1.300000
50% 76.000000 56.000000 8.000000 4.200000
75% 188.000000 128.000000 59.000000 7.200000
max 376.000000 438.000000 370.000000 14.400000

Histogram: show the distribution of a numerical variable

# sort the beer column and split it into 3 groups
array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   1,   1,   1,   1,   2,   3,   5,   5,   5,   5,   5,
         6,   6,   6,   6,   8,   8,   8,   9,   9,   9,   9,  12,  13,
        15,  15,  16,  16,  17,  18,  19,  19,  20,  20,  21,  21,  21,
        21,  22,  23,  25,  25,  25,  25,  26,  28,  31,  31,  31,  31,
        32,  32,  34,  36,  36,  36,  37,  42,  42,  43,  44,  45,  47,
        49,  51,  51,  52,  52,  52,  53,  56,  56,  57,  58,  60,  62,
        62,  63,  64,  69,  71,  76,  76,  77,  77,  77,  78,  79,  82,
        82,  85,  88,  89,  90,  92,  93,  93,  98,  99, 102, 105, 106,
       109, 111, 115, 120, 122, 124, 127, 128, 130, 133, 140, 142, 143,
       144, 147, 149, 149, 152, 157, 159, 162, 163, 167, 169, 171, 173,
       185, 188, 192, 193, 193, 194, 194, 196, 197, 199, 203, 206, 213,
       217, 219, 224, 224, 225, 230, 231, 233, 234, 236, 238, 240, 245,
       245, 247, 249, 251, 261, 263, 263, 270, 279, 281, 283, 284, 285,
       295, 297, 306, 313, 333, 343, 343, 346, 347, 361, 376])
# compare with histogram
drinks.beer.plot(kind="hist", bins=3);
../_images/MPL02-Data Visualization with Pandas and Matplotlib_10_0.png
# try more bins
drinks.beer.plot(kind="hist", bins=20);
../_images/MPL02-Data Visualization with Pandas and Matplotlib_11_0.png
# add title and labels
drinks.beer.plot(kind="hist", bins=20, title="Histogram of Beer Servings")
plt.xlabel("Beer Survings") 
# show plot 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_12_0.png
# compare with density plot (smooth version of a histogram)
drinks.beer.plot(kind="density", xlim=(0, 500));
../_images/MPL02-Data Visualization with Pandas and Matplotlib_13_0.png

Scatter Plot: show the relationship between two numerical variables

# select the beer and wine columns and sort by beer 
drinks[["beer", "wine"]].sort_values(by="beer").values
array([[  0,   0],
       [  0,  74],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  1,   7],
       [  1,   1],
       [  1,   4],
       [  1,   1],
       [  2,   0],
       [  3,   1],
       [  5,   0],
       [  5,   0],
       [  5,  16],
       [  5,   1],
       [  5,   0],
       [  6,   1],
       [  6,   0],
       [  6,   1],
       [  6,   9],
       [  8,   0],
       [  8,   1],
       [  8,   1],
       [  9,   2],
       [  9,   0],
       [  9,   7],
       [  9,   0],
       [ 12,  10],
       [ 13,   0],
       [ 15,   3],
       [ 15,   1],
       [ 16,   5],
       [ 16,   0],
       [ 17,   1],
       [ 18,   0],
       [ 19,  32],
       [ 19,   2],
       [ 20,   0],
       [ 20,  31],
       [ 21,  11],
       [ 21,  11],
       [ 21,   5],
       [ 21,   1],
       [ 22,   1],
       [ 23,   0],
       [ 25,   8],
       [ 25,  14],
       [ 25,   2],
       [ 25,   7],
       [ 26,   4],
       [ 28,  21],
       [ 31, 128],
       [ 31,   6],
       [ 31,  10],
       [ 31,   1],
       [ 32,   4],
       [ 32,   1],
       [ 34,  13],
       [ 36,  19],
       [ 36,   5],
       [ 36,   1],
       [ 37,   7],
       [ 42,   2],
       [ 42,   7],
       [ 43,   0],
       [ 44,   1],
       [ 45,   0],
       [ 47,   5],
       [ 49,   8],
       [ 51,  20],
       [ 51,   7],
       [ 52,   2],
       [ 52, 149],
       [ 52,  26],
       [ 53,   2],
       [ 56, 140],
       [ 56,   1],
       [ 57,   1],
       [ 58,   2],
       [ 60,  11],
       [ 62,  18],
       [ 62, 123],
       [ 63,   9],
       [ 64,   4],
       [ 69,   2],
       [ 71,   1],
       [ 76,   8],
       [ 76,   9],
       [ 77,   8],
       [ 77,  16],
       [ 77,   1],
       [ 78,   1],
       [ 79,   8],
       [ 82,   9],
       [ 82,   0],
       [ 85, 237],
       [ 88,   0],
       [ 89,  54],
       [ 90,   2],
       [ 92, 233],
       [ 93,   5],
       [ 93,   1],
       [ 98,  18],
       [ 99,   1],
       [102,  45],
       [105,  24],
       [106,  86],
       [109,  18],
       [111,   1],
       [115, 220],
       [120,  11],
       [122,  51],
       [124,  12],
       [127, 370],
       [128,   7],
       [130, 172],
       [133, 218],
       [140,   9],
       [142,  42],
       [143,  36],
       [144,  16],
       [147,   4],
       [149, 120],
       [149,  11],
       [152, 186],
       [157,  51],
       [159,   3],
       [162,   3],
       [163,  21],
       [167,   8],
       [169, 129],
       [171,  71],
       [173,  35],
       [185, 280],
       [188,   7],
       [192, 113],
       [193,   9],
       [193, 221],
       [194, 339],
       [194,  32],
       [196, 116],
       [197,   7],
       [199,  28],
       [203, 175],
       [206,  45],
       [213,  74],
       [217,  45],
       [219, 195],
       [224,  59],
       [224, 278],
       [225,  81],
       [230, 254],
       [231,  94],
       [233,  78],
       [234, 185],
       [236, 271],
       [238,   5],
       [240, 100],
       [245, 312],
       [245,  16],
       [247,  73],
       [249,  84],
       [251, 190],
       [261, 212],
       [263,  97],
       [263,   8],
       [270, 276],
       [279, 191],
       [281,  62],
       [283, 127],
       [284, 112],
       [285,  18],
       [295, 212],
       [297, 167],
       [306,  23],
       [313, 165],
       [333,   3],
       [343,  56],
       [343,  56],
       [346, 175],
       [347,  59],
       [361, 134],
       [376,   1]])
# compare with scatter plot
drinks.plot(kind="scatter", x="beer", y="wine"); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_16_0.png
# add transparency 
drinks.plot(kind='scatter', x="beer", y="wine", alpha=0.3); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_17_0.png
# vary point color by spirit servings 
drinks.plot(kind="scatter", x="beer", y="wine", c="spirit", colormap="Blues"); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_18_0.png
# scatter matrix of 3 numerical columns 
pd.plotting.scatter_matrix(drinks[['beer', 'wine', 'spirit']]); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_19_0.png
# increase figure size 
# scatter matrix of 3 numerical columns 
pd.plotting.scatter_matrix(drinks[['beer', 'wine', 'spirit']], figsize=(10,8)); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_20_0.png

Bar Plot: show a numerical comparison across different categories

# count the number of countries in each continent 
AF    53
EU    45
AS    44
NA    23
OC    16
SA    12
Name: continent, dtype: int64
# compare with bar plot 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_23_0.png
# calculate the mean alcohol amounts for each continent 
beer spirit wine liters
AF 61.471698 16.339623 16.264151 3.007547
AS 37.045455 60.840909 9.068182 2.170455
EU 193.777778 132.555556 142.222222 8.617778
NA 145.434783 165.739130 24.521739 5.995652
OC 89.687500 58.437500 35.625000 3.381250
SA 175.083333 114.750000 62.416667 6.308333
# side-by-side bar plots 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_25_0.png
# drop the liters column
drinks.groupby('continent').mean().drop('liters', axis=1).plot(kind='bar'); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_26_0.png
# stacked bar plots 
drinks.groupby('continent').mean().drop('liters', axis=1).plot(kind='bar', stacked=True); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_27_0.png

Box Plot: show quartiles (and outliers) for one or more numerical variables

Five-Number Summary

  • min = minimum value

  • 25% = first quartile (Q1) = median of the lower half of the data

  • 50% = second quartile (Q2) = median of the data

  • 75% = third quartile (Q3) = median of the upper half of the data

  • max = maximum value

The five-number summary is more useful than the mean and standard deviation for describing skewed distributions.

  • Interquartile Range (IQR) = Q3 - Q1


Outliers are values that fall:

  • below Q1 - 1.5 * IQR

  • above Q3 + 1.5 * IQR

# sort the spirit column 
array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   1,   1,   1,
         1,   1,   1,   1,   1,   1,   2,   2,   2,   2,   2,   2,   2,
         3,   3,   3,   3,   3,   3,   3,   3,   4,   4,   4,   5,   5,
         6,   6,   6,   7,   9,  11,  11,  12,  13,  15,  15,  16,  16,
        18,  18,  18,  18,  19,  21,  21,  22,  22,  25,  25,  27,  29,
        31,  31,  34,  35,  35,  35,  35,  38,  39,  41,  41,  42,  42,
        44,  46,  50,  51,  55,  56,  57,  60,  61,  63,  63,  65,  67,
        68,  69,  69,  69,  71,  71,  72,  74,  75,  76,  76,  79,  81,
        84,  87,  87,  88,  97,  97,  98,  98, 100, 100, 100, 100, 101,
       104, 104, 112, 114, 114, 114, 117, 117, 118, 118, 122, 122, 124,
       126, 128, 131, 132, 133, 133, 135, 137, 138, 145, 147, 151, 152,
       154, 156, 157, 158, 160, 170, 173, 173, 176, 178, 179, 186, 189,
       192, 194, 200, 202, 205, 215, 215, 216, 221, 226, 237, 244, 246,
       252, 254, 258, 286, 293, 302, 315, 326, 326, 373, 438])
# show five-number summary of spirit 
count    193.000000
mean      80.994819
std       88.284312
min        0.000000
25%        4.000000
50%       56.000000
75%      128.000000
max      438.000000
Name: spirit, dtype: float64
# compare with boxplot 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_31_0.png
# include multiple variables 
drinks.drop('liters', axis=1).plot(kind='box'); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_32_0.png

Line Plot: show the trend of a numerical variable over time

# read ufo data 
ufo = pd.read_csv("../data/ufo.csv")
ufo['Time'] = pd.to_datetime(ufo.Time) 
ufo['Year'] = ufo.Time.dt.year 
# examine first few rows  
City Colors Reported Shape Reported State Time Year
0 Ithaca NaN TRIANGLE NY 1930-06-01 22:00:00 1930
1 Willingboro NaN OTHER NJ 1930-06-30 20:00:00 1930
2 Holyoke NaN OVAL CO 1931-02-15 14:00:00 1931
3 Abilene NaN DISK KS 1931-06-01 13:00:00 1931
4 New York Worlds Fair NaN LIGHT NY 1933-04-18 19:00:00 1933
# observations and columns 
(80543, 6)
# data structure 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80543 entries, 0 to 80542
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   City             80496 non-null  object        
 1   Colors Reported  17034 non-null  object        
 2   Shape Reported   72141 non-null  object        
 3   State            80543 non-null  object        
 4   Time             80543 non-null  datetime64[ns]
 5   Year             80543 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 3.7+ MB
# numerical summary 
count 80543.000000
mean 2004.178737
std 10.602487
min 1930.000000
25% 2001.000000
50% 2007.000000
75% 2011.000000
max 2014.000000
# count the number of ufo reports each year (and sort by year)
1930       2
1931       2
1933       1
1934       1
1935       1
2010    4154
2011    5089
2012    7263
2013    7003
2014    5382
Name: Year, Length: 82, dtype: int64
# compare with line plot 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_40_0.png
# don't use a line plot when there is no logical ordering 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_41_0.png

Grouped Box Plots: show one box plot for each group

# reminder: boxplot of beer servings
drinks.beer.plot(kind='box');
../_images/MPL02-Data Visualization with Pandas and Matplotlib_43_0.png
# boxplot of beer servings grouped by continent
drinks.boxplot(column='beer', by='continent'); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_44_0.png
# boxplot of all numerical columns grouped by continent
../_images/MPL02-Data Visualization with Pandas and Matplotlib_45_0.png

Grouped Histograms: show one histogram for each group

# reminder: histogram of beer servings
drinks.beer.plot(kind='hist', bins=20);
../_images/MPL02-Data Visualization with Pandas and Matplotlib_47_0.png
# histogram of beer servings grouped by continent
drinks.hist(column='beer', by='continent'); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_48_0.png
# share the x-axis 
drinks.hist(column='beer', by='continent', sharex=True); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_49_0.png
# share the x and y axis 
drinks.hist(column='beer', by='continent', sharex=True, sharey=True); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_50_0.png
# change the layout 
drinks.hist(column='beer', by='continent', sharex=True, layout=(2, 3));
../_images/MPL02-Data Visualization with Pandas and Matplotlib_51_0.png

Assorted Functionality

# saving a plot to a file
drinks.beer.plot(kind='hist', bins=20, title="Histogram of Beer Survings")
plt.xlabel("Beer Survings")
plt.savefig("beer_survings.png") # .png, .tiff, .pdf, .jpeg 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_53_0.png
# list available plot style
# use plot style: ggplot
plt.style.use('ggplot')
# histogram of beer servings in ggplot style
drinks.beer.plot(kind="hist", title="Histogram of Beer Survings")
plt.xlabel("Beer Survings")
Text(0, 0.5, 'Frequnecy')
../_images/MPL02-Data Visualization with Pandas and Matplotlib_56_1.png
# use plot style: seaborn
plt.style.use('seaborn')
# histogram of beer servings in seaborn style
drinks.beer.plot(kind="hist", title="Histogram of Beer Survings")
plt.xlabel("Beer Survings")
Text(0, 0.5, 'Frequnecy')
../_images/MPL02-Data Visualization with Pandas and Matplotlib_58_1.png
# use plot style: fivethirtyeight
plt.style.use('fivethirtyeight')
# histogram of beer servings in fivethirtyeight style
drinks.beer.plot(kind="hist", title="Histogram of Beer Survings")
plt.xlabel("Beer Survings")
Text(0, 0.5, 'Frequnecy')
../_images/MPL02-Data Visualization with Pandas and Matplotlib_60_1.png